xarray netcdf
Opening netcdf
combine_nested() requires specifying the order in which the objects should be combined, while combine_by_coords() attempts to infer this ordering automatically from the coordinates in the data.
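For example (a minimal sketch with two toy datasets):
code:python
import xarray as xr

# two datasets covering adjacent time ranges
ds1 = xr.Dataset({"t": ("time", [10.0, 11.0])}, coords={"time": [0, 1]})
ds2 = xr.Dataset({"t": ("time", [12.0, 13.0])}, coords={"time": [2, 3]})

# combine_nested: the order of the inputs determines the result
combined = xr.combine_nested([ds1, ds2], concat_dim="time")

# combine_by_coords: order is inferred from the "time" coordinate,
# so the input order does not matter
combined = xr.combine_by_coords([ds2, ds1])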
open_mfdataset() called without the chunks argument will return dask arrays with chunk sizes equal to the individual files. Re-chunking the dataset after creation with ds.chunk() leads to ineffective use of memory and is not recommended.
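To choose chunk sizes at open time instead of re-chunking afterwards (the chunk size here is illustrative):
code:python
xr.open_mfdataset('my/files/*.nc', chunks={"time": 100})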
code:python
xr.open_mfdataset('my/files/*.nc', parallel=True)
code:python
xr.open_mfdataset('my/files/*.nc', concat_dim="time", combine="nested",
                  data_vars='minimal', coords='minimal', compat='override')
# combine="nested": files are concatenated in the given (sorted) order
# data_vars='minimal', coords='minimal': only variables and coordinates
# that already contain the "time" dimension are concatenated
# compat='override': variables that lack the "time" dimension are taken
# from the first dataset without comparison
Preprocessing before concat
code:python
from glob import glob

import xarray as xr

def read_netcdfs(files, dim, transform_func=None):
    def process_one_path(path):
        # use a context manager, to ensure the file gets closed after use
        with xr.open_dataset(path) as ds:
            # transform_func should do some sort of selection or
            # aggregation
            if transform_func is not None:
                ds = transform_func(ds)
            # load all data from the transformed dataset, to ensure we can
            # use it after closing each original file
            ds.load()
            return ds

    paths = sorted(glob(files))
    # open, transform, and load each file, then combine along dim
    datasets = [process_one_path(p) for p in paths]
    combined = xr.concat(datasets, dim)
    return combined
# here we suppose we only care about the combined mean of each file;
# you might also use indexing operations like .sel to subset datasets
combined = read_netcdfs('/all/my/files/*.nc', dim='time',
                        transform_func=lambda ds: ds.mean())
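Alternatively, open_mfdataset() can combine the files in one call. A typical invocation (files is a list of paths or a glob pattern):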
code:python
ds = xr.open_mfdataset(
    files,
    # Attempt to auto-magically combine the given datasets into one by
    # using dimension coordinates; use this when files are not sorted.
    # (concat_dim is only valid together with combine="nested" and must
    # not be passed with combine="by_coords".)
    combine="by_coords",
    # Specify chunk sizes for dask.
    chunks={"lev": 1, "time": 500},
    # Only data variables in which the dimension already appears are included.
    data_vars="minimal",
    # Only coordinates in which the dimension already appears are included.
    coords="minimal",
    # Skip comparing and pick variable from first dataset.
    compat="override",
    # Open the files in parallel using dask.delayed.
    parallel=True,
)
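The preprocess argument applies a function to each dataset before combining, e.g. to select a single level and grid point (here: ERA5 monthly-averaged vertical winds at 850 hPa near Melbourne):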
code:python
import glob
import logging

import xarray as xr
from dask.distributed import Client

client = Client(threads_per_worker=1, memory_limit=0, silence_logs=logging.ERROR)
client  # display client info in a notebook

def _preprocess(ds):
    # subset each file to a single point (850 hPa over Melbourne) before combining
    return ds.sel(level=850, latitude=-37.81, longitude=144.96, method='nearest')

era5_vert_winds_files = glob.glob('/g/data/rt52/era5/pressure-levels/monthly-averaged/w/20*/*.nc')
ds_850_mel = xr.open_mfdataset(era5_vert_winds_files, parallel=True, preprocess=_preprocess)
ds_850_mel
Writing netcdf
By setting the compute argument to False, to_netcdf() will return a dask.delayed object that can be computed later.
code:python
from dask.diagnostics import ProgressBar
# or distributed.progress when using the distributed scheduler
delayed_obj = ds.to_netcdf("manipulated-example-data.nc", compute=False)
with ProgressBar():
    results = delayed_obj.compute()